#------------------------------------------------------------------------------
# here's the collection of functions that are used to run data processing and statistical analysis
#==============================================================================

edge_measure_random_entire_peer = function(dep_data,include_self=FALSE, n_sim=1000,n_core=4, type='grade'){

  # --- filter
  dep_data[grade == 13 | grade == 6, grade := NA]
  dep_data = dep_data[!is.na(grade)&!is.na(scid),]

  # --- generate school-grade id 
  setorder(dep_data, scid, grade)

  dep_data[,scid_grade := paste0(scid,":",grade)]

  dep_data[, `:=`(
        qt05 = quantile(depress,0.05,type = 7, na.rm=TRUE),
        qt10 = quantile(depress,0.10,type = 7, na.rm=TRUE),
        qt90 = quantile(depress,0.90,type = 7, na.rm=TRUE),
        qt95 = quantile(depress,0.95,type = 7, na.rm=TRUE)
        )]  

  peer_depress = dep_data[,.(
    peer_depress_median = median(depress,na.rm=TRUE),
    peer_depress_q05 = mean(depress <= qt05,na.rm=TRUE),
    peer_depress_q95 = mean(depress >= qt95,na.rm=TRUE)
    ), by=c('scid','grade')]

  peer_depress[,scid_grade := paste0(scid,":",grade)]

  list_peer_depress = mclapply(1:n_sim, function(i){
    message('now constructing random grade peer ',i)
    permute_data = dep_data 
    permute_data[, random_grade_within_school := sample(grade), by='scid']  
    permute_data[,scid_grade := paste0(scid,":",random_grade_within_school)]
  
    data_out = merge(x = permute_data[,c('aid','scid_grade','depress')], y = peer_depress, by=c('scid_grade'), all.x=TRUE)

    data_out[,rank_depress := frank(depress, ties.method= 'min'), by=c('scid_grade')]
    data_out[,max_rank_depress := max(rank_depress), by=c('scid_grade')]
    data_out[,prank_depress := rank_depress / max_rank_depress]

    data_out[,max_rank_depress := NULL]

    write_fst(data_out,file.path(data_path,'processed','simulation_all',type,paste0('sim_',type,i,'.fst')),100)

  }, mc.cores = n_core)

  return(NULL)
}

run_FE_random_peer = function(random_peer_depress_grade, 
                         file_name=NULL, n_core = 4,
                         peer_hetero = c('peer_depress_q95','peer_depress_q05'),
                         peer_center = 'peer_depress_median',
                         cluster='scid',
                         group_fixed_effect = 'scid'){

ind_controls = c('female','black','hispanic','other','immig_1st','immig_2nd',
  'family_step','family_one','family_other','pvt','pa_educ','assistance','sibsize')
  
  reg_data = combine_data(key='aid',list_data_tables = list(
    processed_data_wave1, processed_data_wave2, weight_wave1, weight_wave2, 
    cleaned_data_inschool, processed_data_depress,random_peer_depress_grade))

  reg_data[!is.na(gswgt2) & !is.na(gswgt1),out_sample := 0L]
  # reg_data[out_sample == 0L & is.na(sqid), out_sample := 1L]
  reg_data[out_sample == 0L & (is.na(peer_depress_median)|is.na(peer_depress_q05)|is.na(peer_depress_q95)), out_sample := 2L]
  reg_data[out_sample == 0L & (is.na(fcesd_2)), out_sample := 3L]
  reg_data[out_sample == 0L & (is.na(depress)), out_sample := 5L]
  reg_data[out_sample == 0L & (is.na(parent_attachment)), out_sample := 6L]
  reg_data[out_sample == 0L & !complete.cases(reg_data[,..ind_controls]), out_sample := 8L]

  # adjust peer measures!
  adjust_var = grep('peer_|\\.mean',names(reg_data),value=TRUE)
  adjust_var = grep('pa_educ.mean',adjust_var,invert = TRUE,value=TRUE)

  adjust_var = grep('peer_depress_median',adjust_var,invert = TRUE,value=TRUE)
  adjust_var = c(adjust_var,'prank_depress')

  for (var in adjust_var)  reg_data[, (var) := get(var)*100]

  # ------- run the model 
  m5 = run_two_way(reg_data=reg_data[out_sample==0,], 
  DV='fcesd_2',
  peer_IVs = c('peer_depress_median','peer_depress_q95','peer_depress_q05'),
  controls=NULL,
  #controls = c('individual','lag'),
  default_controls = NULL,
  weight = 'gswgt2',
  group = 'scid',
  cluster = cluster)
  coef = coef(summary(m5))
  coef_ci = confint(m5)
  names(coef_ci) = c('coef_lci','coef_uci')
  coef_m5 = data.table(coef_name=rownames(coef),coef,coef_ci)
  return(coef_m5)
}


create_twomode_edge = function(ids,dt){
  dt = as.matrix(dt)
  rownames(dt) = ids 
  tmp <- as(dt, "dgTMatrix")    
  edges <- data.frame(P=rownames(dt)[tmp@i + 1], Q=colnames(dt)[tmp@j + 1])
  g = graph.data.frame(edges)
  V(g)$type = bipartite_mapping(g)$type
  g = bipartite_projection(g,which=FALSE)    
  return(data.table(get.edgelist(g)))
}

create_full_edge = function(ids){
  n = length(ids)
  g = make_full_graph(n,directed= TRUE)
  V(g)$name = ids 
  return(data.table(get.edgelist(g)))
}

edge_peer_grade = function(data){

  # --- filter
  data[grade == 13 | grade == 6, grade := NA]
  data[,scid := as.numeric(sschlcde)]  
  data = data[!is.na(grade)&!is.na(scid),]

  # --- generate school-grade id 
  setorder(data, scid,grade)
  data[,scid_grade := paste0(scid,":",grade)]

  # --- use it as two mode networks
  scdata = data[, c('aid','scid_grade')]
  scdata = scdata[aid != '' & scid_grade != '',]

  graph_data = scdata[, .(edge_list = 
      list(create_full_edge(aid))),by=scid_grade]

  all_edgelist = rbindlist(graph_data[,edge_list])
  setnames(all_edgelist, c('V1','V2'), c('aid','peer_aid'))

  return(all_edgelist)
}

edge_peer_friendship = function(friendship_data, inschool_data, same_sex = "all", exclude_self=TRUE, direction='out'){

  female_data = inschool_data[,c('aid',"s2")]
  female_data[,female := ifelse(s2 == 2, 1, 0)]
  female_data[,s2:=NULL]

  names(friendship_data) = tolower(names(friendship_data))

  edgelist = melt(friendship_data, measure = list(paste0("mf",1:5,"aid"), paste0("ff",1:5,"aid")), 
    value.name = c("mfaid", "ffaid"), variable.name = 'order')
  edgelist = melt(edgelist, measure = list(c('mfaid','ffaid')), value.name='aid',variable.name='gender')
    setnames(edgelist, c('aid'),c('peer_aid'))
    edgelist = edgelist[sqid != 999999,]
    edgelist = na.omit(edgelist)

  edgelist = merge(x=edgelist, y=inschool_data[,c('sqid','aid')], by='sqid',all.x=TRUE)
    invalid_codes = c(77777777,88888888,99999999,99959995)
    edgelist = edgelist[!(peer_aid %in% invalid_codes), ]
    edgelist[,peer_aid := as.character(peer_aid)]
  edgelist = merge(x=edgelist, y=female_data, by.x='aid', by.y='aid',all.x=TRUE)
  edgelist = merge(x=edgelist, y=female_data, by.x='peer_aid', by.y='aid',all.x=TRUE)

  edgelist[,same_sex := as.integer(female.x == female.y)]
  edgelist = edgelist[,c('aid','peer_aid','same_sex')]
  
  # exclude self-nomination 
  if (exclude_self == TRUE) {
    edgelist = edgelist[aid != peer_aid,]
  }

  # filter by same-sex and opposite-sex friendship
  if (same_sex == TRUE){
    edgelist = edgelist[same_sex == 1, ]
  } else if (same_sex == FALSE){
    edgelist = edgelist[same_sex == 0, ]
  } else {
    # nothing happens
  }
    edgelist = edgelist[,c('aid','peer_aid')]

  # consider friendship direction 
    if (direction == 'out'){
      # nothing happens
    } else if (direction == 'in') {
      names(edgelist) = c('peer_aid','aid')
    } else if (direction == 'all') {
      edgelist_in = edgelist
      names(edgelist_in) = c('peer_aid','aid')
      edgelist = rbind(edgelist,edgelist_in)
    } else if (direction == 'both') {
      edgelist_in = edgelist
      names(edgelist_in) = c('peer_aid','aid')
      edgelist = rbind(edgelist,edgelist_in)
      edgelist[, n_tie := .N, by=c('aid','peer_aid')]
      edgelist = unique(edgelist[n_tie == 2,])
        edgelist[, n_tie := NULL]
    } else {
      stop("specify direction type")
    }

  return(edgelist)
}


edge_peer_course_network = function(data, filter = 94, type = 'W|re', course_group = NULL){

  names(data) = tolower(names(data))

  # --- filter
  if (filter != 'all'){
    data = data[enyrlp == filter,]  
  } 
  
  # school id
  data[,scid := abs(as.numeric(ensclid))]  

  # group id
  if (type == 'W|re') group_var = 'encombcw'
  if (type == 'U|re') group_var = 'encombcu'
  if (type == 'W|one') group_var = 'enclustw'
  if (type == 'U|one') group_var = 'enclustu'

  data[,group := get(group_var)]
  data = data[!is.na(group)&!is.na(scid),]

  # --- generate school-grade id 
  setorder(data, scid,group)
  data[,scid_group := paste0(scid,":",group)]

    # --- use it as two mode networks
  scdata = data[, c('aid','scid_group')]
  scdata = scdata[aid != '' & scid_group != '',]
  scdata = unique(scdata)

  graph_data = scdata[, .(edge_list = 
      list(create_full_edge(aid))),by=scid_group]

  all_edgelist = rbindlist(graph_data[,edge_list])
  setnames(all_edgelist, c('V1','V2'), c('aid','peer_aid'))
  #all_edgelist = merge(x=all_edgelist, y = scdata, by.x=c('aid'), by.y=c('aid'),all.x=TRUE)
  if (is.null(course_group)){
    return(all_edgelist)  
  } else {
    scdata = data[, c('aid','scid','scid_group')]
    scdata = scdata[aid != '' & scid_group != '',]
    scdata = unique(scdata)
    return(scdata)
  }
  
}


edge_peer_club = function(club_data){

  club_data = club_data[s44 == 0,-"s44"]
  club_data = club_data[aid != '',]
  setorder(club_data,scid,aid)

  graph_data = club_data[, .(edge_list = 
    list(create_twomode_edge(aid,.SD))),.SDcols=paste0("s44a",1:33),by=scid]
  
  edgelist = rbindlist(graph_data[,edge_list])
  setnames(edgelist, c('V1','V2'), c('aid','peer_aid'))

  return(edgelist)
}



process_inschool = function(processed_data){

  # ---- some more data processing 
  processed_data[,`:=`(
    parent_usborn = 1 - parent_foreign,
    self_usborn = usborn,
    pa_educ = pa_maxeduc,
    immigration = car::recode(immigrant, c('4=3'))
    )]
  processed_data[, `:=`(
    immig_1st = as.integer(immigration == 1),
    immig_2nd = as.integer(immigration == 2),
    immig_3rd = as.integer(immigration == 3),

    family_two = as.integer(family == 1),
    family_one = as.integer(family %in% c(2,3)),
    family_other = as.integer(family %in% c(1,2,3) == FALSE))]
  processed_data[sex != 0,female := as.integer(sex == 2)]
  processed_data[,depress := NULL]

  return(processed_data)
}

impute_aid_inschool = function(data){
  # --- impute arbitrary aid into the data
  data[aid =='', aid := as.character(10000000+seq(.N))]
  return(data)
}

process_grade = function(raw_data){
  raw_data[,scid := as.numeric(sschlcde)]  
  raw_data[s3 >= 7 & s3 <= 12, grade := s3]
  return(raw_data)
}

process_club = function(raw_data){
  raw_data[,scid := as.numeric(sschlcde)]  
  club_data = raw_data[,c('aid','scid',grep('s44',names(raw_data),value=TRUE)),with=FALSE]
  return(club_data)
}

process_depress = function(raw_data, dep_indicators){
  raw_data[,scid := as.numeric(sschlcde)]  
  raw_data[s3 >= 7 & s3 <= 12, grade := s3]

  dep_data = raw_data[,c('aid','scid','grade',dep_indicators),with=FALSE]
  dep_data[,depress := rowMeans(.SD,na.rm=TRUE), .SDcols = dep_indicators]
  return(dep_data[,c('aid','scid','grade','depress')])
}

measure_self_depress_rank = function(edgelist, dep_data){

  edgelist = edgelist[,c('aid','peer_aid')]
  edgelist_aid = unique(edgelist[,aid])
  edgelist = rbind(edgelist,data.table(aid=edgelist_aid, peer_aid=edgelist_aid))
  
  var = 'depress'
  attr_data = dep_data[,c('aid',var),with=FALSE]
  
  edgelist = merge(x=edgelist, y= attr_data, by.x=c('peer_aid'), by.y=c('aid'), all.x=TRUE)
  setorder(edgelist,aid,depress)
  
  edgelist = na.omit(edgelist)
    edgelist[,rank_depress := frank(depress, ties.method= 'min'), by='aid']
    edgelist[,max_rank_depress := max(rank_depress), by='aid']
    edgelist[,prank_depress := rank_depress / max_rank_depress]
  rank_depress = edgelist[aid == peer_aid,c('aid','rank_depress','prank_depress')]

  return(rank_depress)
}

measure_peer_depress_bottom = function(edgelist, dep_data, include_self='include_self', filter='filter', reference='all'){

# calculate quantiles
if (filter == 'filter') {
  subsample_aids = unique(unlist(edgelist[,c('aid','peer_aid')]))
  filtered_row = na.omit(match(subsample_aids,dep_data[,aid]))
} else {
  filtered_row = 1:nrow(dep_data)
}
if (reference == 'all'){
  dep_data[filtered_row, `:=`(
      qt05 = quantile(depress,0.05,type = 7, na.rm=TRUE),
      qt10 = quantile(depress,0.10,type = 7, na.rm=TRUE),
      qt15 = quantile(depress,0.15,type = 7, na.rm=TRUE),
      qt20 = quantile(depress,0.20,type = 7, na.rm=TRUE),
      qt25 = quantile(depress,0.25,type = 7, na.rm=TRUE),
      qt90 = quantile(depress,0.90,type = 7, na.rm=TRUE),
      qt95 = quantile(depress,0.95,type = 7, na.rm=TRUE)
      )]  
} else if (reference == 'school') {
  dep_data[filtered_row, `:=`(
      qt05 = quantile(depress,0.05,type = 7, na.rm=TRUE),
      qt10 = quantile(depress,0.10,type = 7, na.rm=TRUE),
      qt15 = quantile(depress,0.15,type = 7, na.rm=TRUE),
      qt20 = quantile(depress,0.20,type = 7, na.rm=TRUE),
      qt25 = quantile(depress,0.25,type = 7, na.rm=TRUE),
      qt90 = quantile(depress,0.90,type = 7, na.rm=TRUE),
      qt95 = quantile(depress,0.95,type = 7, na.rm=TRUE)
      ), by=c('scid')]  
} else if (reference == 'grade'){
  dep_data[filtered_row, `:=`(
      qt05 = quantile(depress,0.05,type = 7, na.rm=TRUE),
      qt10 = quantile(depress,0.10,type = 7, na.rm=TRUE),
      qt15 = quantile(depress,0.15,type = 7, na.rm=TRUE),
      qt20 = quantile(depress,0.20,type = 7, na.rm=TRUE),
      qt25 = quantile(depress,0.25,type = 7, na.rm=TRUE),
      qt90 = quantile(depress,0.90,type = 7, na.rm=TRUE),
      qt95 = quantile(depress,0.95,type = 7, na.rm=TRUE)
      ), by=c('grade')]  
}

  var = c('depress','qt05','qt10','qt15','qt20','qt25','qt90','qt95')
  attr_data = dep_data[,c('aid',var),with=FALSE]
  
  edgelist = edgelist[,c('aid','peer_aid')]

    if (include_self == 'include_self'){
    edgelist_aid = unique(edgelist[,aid])
    edgelist = rbind(edgelist,data.table(aid=edgelist_aid, peer_aid=edgelist_aid))
  }
  edgelist = merge(x=edgelist, y= attr_data, by.x=c('peer_aid'), by.y=c('aid'), all.x=TRUE)

  peer_depress = edgelist[,.(
    peer_depress_mean = mean(depress,na.rm=TRUE),
    peer_depress_median = median(depress,na.rm=TRUE),
    peer_depress_q05 = mean(depress <= qt05,na.rm=TRUE),
    peer_depress_q10 = mean(depress <= qt10,na.rm=TRUE),
    peer_depress_q15 = mean(depress <= qt15,na.rm=TRUE),
    peer_depress_q20 = mean(depress <= qt20,na.rm=TRUE),
    peer_depress_q25 = mean(depress <= qt25,na.rm=TRUE),
    peer_depress_q90 = mean(depress >= qt90,na.rm=TRUE),
    peer_depress_q95 = mean(depress >= qt95,na.rm=TRUE),
    N_peer = .N
    ), by='aid']
  return(peer_depress)
}

measure_peer_control = function(edgelist, processed_data_inschool, control.varlist, include_self=FALSE){
  
  # ---- merge with attribute data
  attr_data = processed_data_inschool[,c('aid',control.varlist),with=FALSE]

  edgelist = edgelist[,c('aid','peer_aid')]
  if (include_self == TRUE){
    edgelist_aid = unique(edgelist[,aid])
    edgelist = rbind(edgelist,data.table(aid=edgelist_aid, peer_aid=edgelist_aid))
  }

  edgelist = merge(x=edgelist, y= attr_data, by.x=c('peer_aid'), by.y=c('aid'), all.x=TRUE)
  peer_mean <- edgelist[, sapply(.SD, function(x) list(mean = mean(x, na.rm=TRUE))), 
    .SDcols = control.varlist, by = 'aid']

  return(peer_mean)
}

combine_data = function(key, list_data_tables){
  for (i in 1:length(list_data_tables)) {
    list_data_tables[[i]] <- setkeyv(list_data_tables[[i]],key)
  }

  merged_data = Reduce(function(...) merge(..., all.x = TRUE), list_data_tables)

  # final process before regression
  merged_data[,`:=`(
	female = female_1, 
	
	white = as.integer(race_h_1 == 1),
	black = as.integer(race_h_1 == 2),
	hispanic = as.integer(race_h_1 == 3),
	other = as.integer(race_h_1 == 4),
	
	immig_1st = as.integer(immig_2_1 == 1),
	immig_2nd = as.integer(immig_2_1 == 2),
	immig_3rd = as.integer(immig_2_1 == 3),
	
	family_two = as.integer(famst5_1 == 1),
	family_step = as.integer(famst5_1 == 2),
	family_one = as.integer(famst5_1 == 6),
	family_other = as.integer(famst5_1 %in% c(7,8)),

	pa_educ = c_paeduc_max_1,
	pvt = pvt_std_1,
	sibsize = sibsize_1,
	assistance = family_pub_ass_1,

	parent_attachment = s_pattachment_1,
	school_attachment = s_sattachment_1,
  scid = scid_1
	) ]

  merged_data[female == 0, bfcesd_2 := ifelse(fcesd_2 > 22, 1, 0)]
  merged_data[female == 1, bfcesd_2 := ifelse(fcesd_2 > 24, 1, 0)]

  return(merged_data)
}





run_two_way = function(reg_data, DV, peer_IVs, group='scid', time= 'grade.x',controls = NULL, default_controls='N_peer',weight=NULL, cluster=NULL){

  detect_controls = function (x) {
    sum(grepl(x,controls)) > 0
  }

  if (detect_controls('individual')) {
    ind_controls = c('female','black','hispanic','other','immig_1st','immig_2nd',
      'family_step','family_one','family_other','pvt','pa_educ','assistance','sibsize')
  } else {
    ind_controls = NULL
  }
  if (detect_controls('attachment')){
    ind_attach = c('parent_attachment') 
  } else {
    ind_attach = NULL
  }
  
  if (detect_controls('lag')) {
    ind_lag = c('prank_depress')  
  } else {
    ind_lag = NULL
  }
  
  if (detect_controls('peer')){
    peer_controls = c(
      paste0(c('female','black','hispanic','other','immig_1st','immig_2nd',
      'family_one','family_other','pa_educ'),'.mean'))
  } else {
    peer_controls = NULL
  }

  controls = c(default_controls,ind_controls, peer_controls, ind_attach, ind_lag)
  if (!is.null(peer_IVs)){
    keepcols = c(DV,grep('\\*',peer_IVs, invert=TRUE,value=TRUE),controls,group,time,weight,cluster)
  } else {
    keepcols = c(DV,controls,group,time,weight,cluster)    
  }
  sample_selection = complete.cases(reg_data[,..keepcols])
  model_data = reg_data[sample_selection,]
  
  if (is.null(cluster)) cluster = '0'

  model = as.formula(paste0(DV,"~",paste0(c(peer_IVs,controls),collapse = '+'),
      " | ",paste(c(group,time),collapse = "+"),
      " | ","0",
      " | ",cluster))   
  if (!is.null(weight)){
    fixedlm.fit = felm(model,weights=model_data[[weight]],data=model_data)
  } else {
    fixedlm.fit = felm(model,data=model_data)
  } 

  return(fixedlm.fit) 
} 